" Unit 4 - Lecture 1 "
"------------------------------------------------------------------------"

" Linear Regression "

"------------------------------------------------------------------------"

" Building the Model "
"

<var> = lm(<formula>)

formula = <dep var> ~ <independent var> "

"------------------------------------------------------------------------"

"
Problem:
Load the inbuilt dataset 'mtcars'.

Predict the variable 'mpg' using
all the other variables available in
the data.

Comment on the characteristics of the
model built.

"
# View() opens a GUI data viewer, so only call it when a user is present;
# this keeps the script runnable in a non-interactive session.
if (interactive()) {
  View(mtcars)
}

"
Question:
- Identify Y and Xi's
"

" Building the Model "

" Creating a Model with One Regressor "

# Scatter plot of the response (mpg) against the single regressor (wt).
plot(mtcars$wt,
     mtcars$mpg)

" mpg = alpha + beta * wt "

# Simple linear regression of mpg on wt.
model <- lm(mpg ~ wt,
            data = mtcars)

S <- summary(model)
S$adj.r.squared


"
Adj.R^2 = 1 - (SSres / (n - p - 1)) / (SST / (n - 1))
Adj.R^2 = 1 - (MSSres) / (MSST)

p: no. of Xs

"
aov.tab <- anova(model)
aov.tab

# Rebuild Adj.R^2 from the ANOVA table instead of typing rounded sums of
# squares and the sample size by hand, so it agrees with S$adj.r.squared.
ss.res <- aov.tab["Residuals", "Sum Sq"]   # SSres
ss.tot <- sum(aov.tab[["Sum Sq"]])         # SST = SSreg + SSres
n.obs <- nobs(model)                       # n
n.reg <- length(coef(model)) - 1           # p: no. of Xs (intercept excluded)

adj.r2.manual <- 1 - (ss.res / (n.obs - n.reg - 1)) / (ss.tot / (n.obs - 1))
adj.r2.manual

" OR "

# Same fit written without `data =`. The coefficients are identical, but the
# terms are named `mtcars$wt` rather than `wt`, so predict() cannot match the
# columns of a `newdata` frame against a model fitted this way.
model <- lm(mtcars$mpg ~ mtcars$wt)

model


"
Both of the above calls build the same
Linear Regression model.
"

"------------------------------------------------------------------------"


" summary(model) provides R^2, Adj R^2,
  Global Testing, 
  Test for Individual Regression Coefficients
"

# Print the full summary of the current `model`: coefficient table with the
# individual t-tests, R^2, Adj R^2 and the overall F-test.
summary(model)


"
Hypothesis:
H0: alpha = 0
v/s
H1: alpha != 0


H0: Beta = 0
v/s
H1: Beta != 0
"


"
Understand and Interpret each and every
component available in the output.
"

"------------------------------------------------------------------------"

" Making Prediction(s) "

"
Output for fitted.values / predict (if new wt given)
mpg = 37.285 - 5.344 * wt

"
# Fit both models with `data =` so predict() can look the regressors up by
# column name in `newdata`. A fit written as lm(mtcars$mpg ~ mtcars$wt)
# ignores `newdata` and returns the 32 in-sample fitted values instead.
model <- lm(mpg ~ wt, data = mtcars)
model.new <- lm(mpg ~ wt + hp, data = mtcars)

wt <- 3.5

" Manual "

37.285 - 5.344 * wt


" Using predict() Function "

# Syntax:
#   predict(<model>,
#           newdata = <Xs>)

# One prediction per row of `newdata`; column names must match the regressors.
predict(model,
        newdata = data.frame(wt = c(3.5, 4, 3.25)))

# mpg = alpha + beta_1 * wt + beta_2 * hp
predict(model.new,
        newdata = data.frame(wt = c(3.5, 4, 3.25),
                             hp = c(2, 3, 4)))

"------------------------------------------------------------------------"

" Global Testing "
"
H0: B1 = B2 = ... = Bk = 0
v/s
H1: At least one Bi != 0

"

# ANOVA table (F-test) for the model currently stored in `model`.
anova(model)

# Two-regressor model: mpg explained by weight and horsepower.
model.new <- lm(formula = mpg ~ wt + hp,
                data = mtcars)

# ANOVA table for the two-regressor model.
anova(model.new)


"------------------------------------------------------------------------"

" Working with Categorical Variables "

# Column types before the conversion.
str(mtcars)

# Store `am` as a factor so lm() treats it as categorical; inspect again.
mtcars[["am"]] <- as.factor(mtcars[["am"]])
str(mtcars)


# Pick the sales CSV through the file dialog and load it.
Sales <- read.csv(file = file.choose())

# Shipping mode is categorical: convert it before modelling.
Sales[["Ship.Mode"]] <- as.factor(Sales[["Ship.Mode"]])
str(Sales)

# Regress profit on the categorical shipping mode.
model.sales <- lm(formula = Profit ~ Ship.Mode, data = Sales)


"------------------------------------------------------------------------"

" Model with only Intercept term / Null Model "
"
Y = B0
"
# Intercept-only fit: the formula `~ 1` has no regressors.
model <- lm(formula = mpg ~ 1, data = mtcars)


" Model with NO Intercept term "
"
Y = B1 * X
"

# `- 1` in the formula drops the intercept, leaving only the slope on wt.
model <- lm(formula = mpg ~ wt - 1, data = mtcars)


" Model with all regressors "
# Template (placeholders are not valid R, so they are kept as a comment):
#   model = lm(<target> ~ .,
#              data = )

# `.` stands for every column of `data` other than the response.
model <- lm(mpg ~ .,
            data = mtcars)

summary(model)

" Model with one regressors "
# Template:
#   model = lm(<target> ~ <reg_one>,
#              data = )


" Model with two regressors "
# Template:
#   model = lm(<target> ~ <reg_one> + <reg_two>,
#              data = )

" Model without one regressors "
# Template:
#   model = lm(<target> ~ . - <reg_one>,
#              data = )

# All regressors except `am` and `cyl`.
model <- lm(mpg ~ . - am - cyl,
            data = mtcars)

"------------------------------------------------------------------------"

" Plotting the Regression Line "

# Refit the simple regression that supplies the line.
model <- lm(formula = mpg ~ wt, data = mtcars)

# Scatter plot of the raw data.
plot(x = mtcars$wt,
     y = mtcars$mpg)

# Way 1: pass the intercept and slope explicitly.
abline(a = model$coefficients[1],
       b = model$coefficients[2],
       col = "red")

# Way 2: hand the fitted model to abline() and let it extract both.
abline(reg = model,
       col = "red")

"------------------------------------------------------------------------"

" Assumptions Testing "
"
1.) Homoscedasticity of errors
2.) No Autocorrelation among residuals
3.) Relationship b/w 'Y' & 'Xi' is linear
4.) Errors ~ Normal

"

# Diagnostic plot 1 of the fitted model: residuals against fitted values
# (used to eyeball constant variance and linearity).
plot(model,which = 1)

# Residuals plotted against observation index (looks for patterns over the
# ordering of the rows).
plot(model$residuals)


# Deliberately non-linear data: Y grows exponentially in X.
X <- seq.int(0L, 20L)
Y <- exp(X)

plot(X, Y)

# Fit a straight line anyway and plot its residuals against the index.
model.check <- lm(formula = Y ~ X)
plot(model.check$residuals)


"------------------------------------------------------------------------"

" Question "
" Import the data Body Fat.

1.) Fit a Regression model containing only the
    Intercept 

2.) Fit a Regression Model Containing
    all explanatory variables.
    Comment on the Accuracy.

3.) Verify the assumptions.

4.) Add a variable 'Group' in the data set,
    such that
    Age < 18 : Band 1
    Age >= 18 & Age <= 36 : Band 2
    Else : Band 3
    
    and remove the Age column from the data set.

5.) Fit a new model and comment

"

# Load the Body Fat data through the file dialog.
Data <- read.csv(file.choose())

# NOTE(review): Age is dropped here, before any model is fitted, and the
# 'Group' band from step 4 of the question is never created; steps 4 and 5
# are not implemented below. Confirm whether that is intended.
Data$Age <- NULL


"1."

# Intercept-only model for Density.
model.1 <- lm(Density ~ 1, Data)
model.1

"2."

# Density on every remaining column of Data.
model.2 <- lm(Density ~ .,
              Data)

# Spell the component name out instead of relying on `$` partial matching.
summary(model.2)$adj.r.squared


"3."

# Standard lm diagnostic plots, then residuals against observation index.
plot(model.2)
plot(model.2$residuals)


"------------------------------------------------------------------------"


" Confidence Interval For Beta's "

# Syntax
#   confint(<model>)
# (kept as a comment: the <model> placeholder is not valid R)

# Confidence intervals for the coefficients of model.2 (95% by default).
confint(model.2)

"------------------------------------------------------------------------"

" Feature Selection "

" Forward Selection "

# Step 0: intercept-only baseline.
model0 <- lm(mpg ~ 1, data = mtcars)
summary(model0)$adj.r.squared


# Step 1, written out with lm(): try each candidate regressor on its own and
# compare the Adj R^2 values.
model1 <- lm(mpg ~ wt, data = mtcars)
summary(model1)$adj.r.squared

model1 <- lm(mpg ~ disp, data = mtcars)
summary(model1)$adj.r.squared

model1 <- lm(mpg ~ qsec, data = mtcars)
summary(model1)$adj.r.squared


# wt is kept as the first regressor.
model1.final <- lm(mpg ~ wt, mtcars)


# Step 1 again, this time with update(): `. ~ . + x` adds x to the formula
# of the existing model.
model1 <- update(model0, . ~ . + wt)
summary(model1)$adj.r.squared

model1 <- update(model0, . ~ . + disp)
summary(model1)$adj.r.squared

model1 <- update(model0, . ~ . + qsec)
summary(model1)$adj.r.squared


# Step 2: add each remaining candidate to the wt model.
# `$adj.r.squared` is spelled out rather than relying on `$` partial matching.
model2 <- update(model1.final,
                 . ~ . + disp)

summary(model2)$adj.r.squared

model2 <- update(model1.final,
                 . ~ . + qsec)

summary(model2)$adj.r.squared


# qsec is kept as the second regressor.
model2.final <- lm(mpg ~ wt + qsec, mtcars)
summary(model2.final)



" Backward Selection "

# Step 0: start from the model with all three candidates.
model0 <- lm(mpg ~ wt + qsec + disp, data = mtcars)
summary(model0)$adj.r.squared


# Step 1: drop each regressor in turn (`. ~ . - x`) and compare Adj R^2.
model1 <- update(model0, . ~ . - wt)
summary(model1)$adj.r.squared

model1 <- update(model0, . ~ . - disp)
summary(model1)$adj.r.squared

model1 <- update(model0, . ~ . - qsec)
summary(model1)$adj.r.squared


# disp is the regressor removed at this step.
# `$adj.r.squared` is spelled out rather than relying on `$` partial matching.
model1.final <- update(model0, . ~ . - disp)
summary(model1.final)$adj.r.squared

# Step 2: try removing each of the two remaining regressors.
model2 <- update(model1.final,
                 . ~ . - qsec)

summary(model2)$adj.r.squared

model2 <- update(model1.final,
                 . ~ . - wt)

summary(model2)$adj.r.squared


# Nothing further is removed: the step-1 model is kept as the final model.
model2.final <- model1.final
summary(model2.final)


"------------------------------------------------------------------------"

" Generalized Linear Models "

" Two components.
  Family: Distribution of Y
  Link function "

# GLM with a Gamma response and a log link.
model <- glm(formula = mpg ~ wt,
             data = mtcars,
             family = Gamma(link = "log"))

summary(model)


# glm() with no `family` argument uses its default family.
model <- glm(formula = mpg ~ wt, data = mtcars)


# The same fit with the gaussian family spelled out.
model <- glm(formula = mpg ~ wt,
             data = mtcars,
             family = gaussian())

summary(model)


# Ordinary least squares via lm() for comparison with the gaussian GLM.
model <- lm(formula = mpg ~ wt, data = mtcars)

summary(model)


"
Few Family:
Gamma()
gaussian()
poisson()
binomial() 


Few Link Functions:
log
identity
inverse

"

" Plot a Density chart to see the shape of the 
  distribution (cont. distribution) "

# Kernel density estimate of mpg, drawn to inspect the shape of the response.
plot(x = density(x = mtcars$mpg))


"------------------------------------------------------------------------"

" Introducing AIC "
" AIC: Akaike's Information Criteria "

" Know the rules about AIC "
"
- Lower the AIC, the better it is
- AIC can be < 0
"
"------------------------------------------------------------------------"


# Nested models of increasing size, all fitted to mtcars.
model.0 <- lm(mpg ~ 1, mtcars)
model.1 <- lm(mpg ~ wt, mtcars)
model.2 <- lm(mpg ~ wt + qsec, mtcars)
model.3 <- lm(mpg ~ wt + qsec + disp, mtcars)

# Two pure-noise columns, used to show what adding irrelevant regressors
# does to the AIC. The seed is fixed so the AIC of model.4 is the same on
# every run, and the length follows the data instead of a literal 32.
set.seed(42)
n.rows <- nrow(mtcars)
mtcars$G1 <- runif(n.rows, -100000, 100000)
mtcars$G2 <- runif(n.rows, -100000, 100000)

model.4 <- lm(mpg ~ wt + qsec +
                disp + G1 + G2, mtcars)

AIC(model.4)


# AIC of every model, smallest to largest formula (lower is better).
AIC(model.0)
AIC(model.1)
AIC(model.2)
AIC(model.3)
AIC(model.4)
